In this report I add three different datasets to an sql database and join the table together. With this data I make three different graphs to show my skill in R and SQL.
First thing is loading all the packages I use
library(dslabs)
library(readr)
library(tidyverse)
library(here)
library(DBI)
library(plotly)
Here I load the data with the use of the “read_csv” command and the “here package”
# Laden van flu data en de eerste 10 rows skippen
flu_data <- read_csv(here("data","flu_data.csv"), skip = 10)
# Laden van denque data en de eerste 10 rows skippen
dengue_data <- read_csv(here("data","dengue_data.csv"), skip = 10)
# Laden van gampinder in gampinder (niet nuttig).
gapminder <- gapminder
Here a make the tables tidy, this for later use (is easier to work with tidy data).
# gapminder zelfde colnaam geven
gapminder_tidy <- gapminder %>% rename(Date = year)
# flu_data tidy maken
flu_data_tidy <- pivot_longer(data = flu_data, cols = -c("Date"), names_to = "country", values_to = "cases")
# en factor van country maken
flu_data_tidy$country <- as.factor(flu_data_tidy$country)
# dengue_data tidy maken
dengue_data_tidy <- pivot_longer(data = dengue_data, cols = -c("Date"), names_to = "country", values_to = "activity")
# en factor van country maken
dengue_data_tidy$country <- as.factor(dengue_data_tidy$country)
# de duplicate rows verwijderen
flu_data_tidy <- unique(flu_data_tidy[,1:3])
# de duplicate rows verwijderen
dengue_data_tidy <- unique(dengue_data_tidy[,1:3])
# Oplsaan als CSV bestand
write_csv(flu_data_tidy, path = here("data","flu_data_tidy.csv"))
# Oplsaan als CSV bestand
write_csv(dengue_data_tidy, path = here("data","dengue_data_tidy.csv"))
# Oplsaan als CSV bestand
write_csv(gapminder_tidy, path = here("data","gapminder_tidy.csv"))
# opslaan als rds bestand
write_rds(flu_data_tidy, path = here("data","flu_data_tidy.rds"))
# opslaan als rds bestand
write_rds(dengue_data_tidy, path = here("data","dengue_data_tidy.rds"))
# opslaan als rds bestand
write_rds(gapminder_tidy, path = here("data","gapminder_tidy.rds"))
# en table maken
CREATE TABLE flu_data (
Date VARCHAR(50),
country VARCHAR(50),
cases VARCHAR(50),
CONSTRAINT PK_flu PRIMARY KEY (Date,country)
);
# de date naar de table verplaatsen
COPY flu_data FROM 'C:/Users/Bas/Desktop/School/Programmeren/datascience/portfolio/data/flu_data_tidy.csv' WITH (FORMAT csv);
# de table laten zien
SELECT * FROM flu_data;
# en table maken
CREATE TABLE dengue_data (
Date VARCHAR(50),
country VARCHAR(50),
activity VARCHAR(50),
CONSTRAINT PK_dengue PRIMARY KEY (Date,country)
);
# de date naar de table verplaatsen
COPY dengue_data FROM 'C:/Users/Bas/Desktop/School/Programmeren/datascience/portfolio/data/dengue_data_tidy.csv' WITH (FORMAT csv);
# de table laten zien
SELECT * FROM dengue_data;
# connect to the database
con <- dbConnect(RPostgres::Postgres(),
dbname = "workflowsdb",
host="localhost",
port="5432",
user="postgres",
password="kaas")
# laat de tables zien
dbListTables(con)
## [1] "flu_data" "dengue_data" "gapminder"
# laat de colummen in flu_data zien
dbListFields(con, "flu_data")
## [1] "date" "country" "cases"
# laat het tabel flu_data zien
head(dbGetQuery(con, 'SELECT * FROM flu_data'))
## date country cases
## 1 Date country cases
## 2 2002-12-29 Argentina NA
## 3 2002-12-29 Australia NA
## 4 2002-12-29 Austria NA
## 5 2002-12-29 Belgium NA
## 6 2002-12-29 Bolivia NA
# disconnect van de database
dbDisconnect(con)
#create gapminder table
CREATE TABLE gapminder (
country VARCHAR(50),
Date VARCHAR(50),
infant_mortality VARCHAR(50) not null,
life_expectancy VARCHAR(50) not null,
fertitlity VARCHAR(50) not null,
population VARCHAR(50) not null,
gdp VARCHAR(50) not null,
continent VARCHAR(50),
region VARCHAR(50),
CONSTRAINT PK_gapminder PRIMARY KEY (Date,country)
);
#import the gampinder file
COPY gapminder FROM 'C:/Users/Bas/Desktop/School/Programmeren/datascience/portfolio/data/gapminder_tidy.csv' WITH (FORMAT csv);
# laat de tabel zien
SELECT * FROM gapminder;
# connect to the database
con <- dbConnect(RPostgres::Postgres(),
dbname = "workflowsdb",
host="localhost",
port="5432",
user="postgres",
password="kaas")
gapminder_flu <- dbGetQuery(con, 'select distinct *
from flu_data,gapminder
where flu_data.country = gapminder.country;'
)
# disconnect van de database
dbDisconnect(con)
# filter for the Netherlands and calculate the average cases over time
cases_netherlands <- gapminder_flu %>% filter(country == "Netherlands") %>% group_by(date=as.Date(date)) %>% summarise(mean_cases=mean(as.numeric(cases)))
# make a gg line plot
netherlands_graph <- cases_netherlands %>%
ggplot(aes(x = date, y = mean_cases)) +
geom_line() +
labs(
title = "flu cases over time in the netherlands",
y = "cases"
)
ggplotly(netherlands_graph)
# string naar date veranderen
gapminder_flu$date <- as.Date(gapminder_flu$date)
# nieuw object maken waar het gemmidelde cases is berekend in 2015 in europa
country_cases <- gapminder_flu %>% filter(between(date, as.Date("2015-01-01"), as.Date("2015-12-30"))) %>% filter(continent == "Europe") %>% group_by(country) %>% summarise(mean_cases=mean(as.numeric(cases)))
# ggplot maken
graph <-country_cases %>%
ggplot(aes(x = country, y = mean_cases, fill = country_cases$country)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
labs(
title = "Average flu cases per europian country in 2015"
)
# ggplot in een plotly veranderen
ggplotly(graph)
# string naar date veranderen
gapminder_flu$date <- as.Date(gapminder_flu$date)
# nieuw object maken waar het gemmidelde life_expectancy is berekend in 2015 in europa
country_life <- gapminder_flu %>% filter(between(date, as.Date("2015-01-01"), as.Date("2015-12-30"))) %>% filter(continent == "Europe") %>% group_by(country) %>% summarise(mean_cases=mean(as.numeric(life_expectancy)))
# ggplot maken
graph_life <-country_life %>%
ggplot(aes(x = country, y = mean_cases, fill = country_cases$country)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
labs(
title = "Average life expectancy per europian country in 2015"
)
# ggplot in een plotly veranderen
ggplotly(graph_life)